2015-07-08

創用 CC 授權條款

Bar Plot

Line Graph

Scatter Plot

IMDB data

movies1 <- movies[!is.na(movies$budget),]
ggplot(movies1, aes(x=budget, y=rating)) + geom_point()

Control the shape & size of points

ggplot(movies1, aes(x=budget, y=rating)) + geom_point(shape=5, size=3)

All point shape types in ggplot2

Grouping: by binary variable

This usually happens accidentally.

ggplot(movies1, aes(x=budget, y=rating, color=Action)) + geom_point()

Grouping: by categarical variable

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) + 
    geom_point() + labs(color='Action Movie?')

Multi-grouping

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action), shape=(length > 120))) + 
    geom_point(size=3) + labs(color='Action Movie?')

Fit regression line

ggplot(movies, aes(x=votes, y=rating)) + geom_point() +
    stat_smooth(method=lm, level=.95) # add se=FALSE to disable CI

The default is a polynomial fit

ggplot(movies, aes(x=votes, y=rating)) + geom_point() + stat_smooth()

Fitting is smart to align with grouping

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) + 
    geom_point() + labs(color='Action Movie?') + stat_smooth(method=lm, se=FALSE)

What if the model is pre-computed?

lm_model <- lm(rating ~ budget, data=movies1)
ggplot(movies1, aes(x=budget, y=rating)) + geom_point() +
    geom_line(aes(x=budget, y=lm_model$fitted.values), color='blue')

Scatter plot "as is": Using geom_text

starmovies <- movies[movies$votes > mean(movies$votes),]
starmovies <- starmovies[order(-starmovies$rating),][1:10,]
ggplot(starmovies, aes(x=votes, y=rating)) + geom_point() + geom_text(aes(label=title))

Fine-tune

ggplot(starmovies, aes(x=votes, y=rating)) + geom_point(color='red') + 
    geom_text(aes(label=title), hjust=0, vjust=0, angle=20) +
    xlim(0, max(starmovies$votes)*2) +
    ylim(min(starmovies$rating), 9.2)

Which Type of Film Cost the Most in Average?

We only choose the movies with single type to simplify the question.

movietype <- colnames(movies)[18:24]
movies1_singletype <- movies1[rowSums(movies1[, movietype]) == 1,] # remove multi-typed
movietype_alt <- c(movietype[length(movietype)], movietype[-length(movietype)]) 
# convert multiple dummies into one factor as grouping var
# a little matrix operation will do the trick
dummies <- as.matrix(movies1_singletype[, movietype_alt])
movies1_singletype$Type <- factor(dummies %*% (1:length(movietype_alt)), labels=movietype_alt)

# Compute the Average Budget of Each Type
tapply(movies1_singletype$budget, movies1_singletype$Type, mean)
##       Short      Action   Animation      Comedy       Drama Documentary 
##    396133.1  32698189.6  32311451.6  11921970.6  10456690.5    729704.8 
##     Romance 
##   5603688.0

Determine the variation

The first factor level of movietype, Short, is represented as the intercept term.

lmfit <- lm(as.formula("budget ~ Type"), movies1_singletype)
summary(lmfit)$coef
##                   Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)       396133.1    1715935  0.2308556 8.174455e-01
## TypeAction      32302056.5    2062063 15.6649187 7.005190e-53
## TypeAnimation   31915318.5    4157723  7.6761525 2.316874e-14
## TypeComedy      11525837.5    1888686  6.1025705 1.202481e-09
## TypeDrama       10060557.4    1820528  5.5261750 3.604075e-08
## TypeDocumentary   333571.7    2881175  0.1157763 9.078389e-01
## TypeRomance      5207554.9    3713295  1.4024082 1.609149e-01

Another way to estimate the coefficients

The last predictor, Short is combined into the intercept term.

# mean(movies1_singletype[movies1_singletype$Animation == 1, 'budget'])
lmfit <- lm(as.formula(paste('budget ~', paste(movietype, collapse=' + '))), 
            movies1_singletype)
summary(lmfit)$coef
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)   396133.1    1715935  0.2308556 8.174455e-01
## Action      32302056.5    2062063 15.6649187 7.005190e-53
## Animation   31915318.5    4157723  7.6761525 2.316874e-14
## Comedy      11525837.5    1888686  6.1025705 1.202481e-09
## Drama       10060557.4    1820528  5.5261750 3.604075e-08
## Documentary   333571.7    2881175  0.1157763 9.078389e-01
## Romance      5207554.9    3713295  1.4024082 1.609149e-01

Draw the regression lines of each type

What is the association between cost and rating, conditional on type?

movies1_reg_plot <- ggplot(movies1_singletype, aes(x=budget, y=rating, color=Type)) + 
  geom_point(shape=1) +
  
  # set fullrange=T will extend the fitted line outside the sample range
  stat_smooth(method=lm, se=FALSE, fullrange=FALSE, size=1.5) +
  
  # color is the grouping interface, hence scale_color_*
  scale_color_discrete(name='Movie Type: # of samples', 
                       labels=paste(levels(movies1_singletype$Type), ': ', 
                                    table(movies1_singletype$Type)))

Output

movies1_reg_plot

Exercise: Temprature and RH

Draw a scatter plot of Temprature and RH in Taipei, Taichung, Kaoshiung and Hualien in 2013-05, 2014-05 and 2015-05. Furthermore, each location has its own color and regression line.

Hint: Use following code to read all data and slice the data you want.

source("R/get_weather_all.R")
weather_all <- get_weather_all()
may <- weather_all[weather_all$month == "May",]

Answer

ggplot(may, aes(x=RH, y=Temperature, color=location)) + 
    geom_point(shape=1) + 
    stat_smooth(method = lm, size = 1.5)

The regression problem behind the scene

interact_terms <- paste(paste(movietype, '*budget', sep=''), collapse=' + ')
lmfit <- lm(as.formula(paste('rating ~', interact_terms)), movies1_singletype)
tail(summary(lmfit)$coef)
##                         Estimate   Std. Error     t value  Pr(>|t|)
## Action:budget       1.580218e-08 3.331784e-08  0.47428594 0.6353367
## budget:Animation    8.246542e-09 3.379426e-08  0.24402199 0.8072334
## budget:Comedy      -6.222262e-10 3.337127e-08 -0.01864557 0.9851253
## budget:Drama        1.295424e-08 3.333295e-08  0.38863176 0.6975810
## budget:Documentary -8.504501e-08 9.010076e-08 -0.94388788 0.3453164
## budget:Romance     -3.983138e-08 4.086954e-08 -0.97459824 0.3298521
  • None of the interactive term is statistically significant, indeed
  • Visualization != Analysis (Our eyes were not born to work on numbers.)
  • Plots can be easily manipluated to be misleading, accidentally or on purpose

References

Bonus: Annotation

Annotation

plot(movies1$budget, movies1$rating) # base solution
abline(h=median(movies1$rating), col='red')
text(x=max(movies1$budget)*.9, y=median(movies1$rating), 
     labels='Median of Rating', col='red', pos=1)

Annotation: Add lines

brggp <- ggplot(movies1, aes(x=budget, y=rating)) + geom_point() 
brggp + geom_hline(yintercept=median(movies1$rating)) # ?geom_abline for general setup
# brggp + geom_hline(data=movies1, aes(yintercept=median(rating)))  # the same
# brggp + geom_hline(aes(yintercept=median(movies1$rating)))        # the same

Annotation: Add (single) texts

brggp + geom_hline(yintercept=median(movies1$rating), color='red') + 
  annotate('text', x=Inf, y=median(movies1$rating), 
           label='Medaion of Rating', color='red', vjust=1.2, hjust=1)
# Don't use geom_text for single annotation to avoid overplotting

Annotation: Add segments

shaw <- movies1[grep('Shawshank Redemption', movies1$title, fixed=TRUE),]
brggp + annotate('segment', xend=shaw$budget, yend=shaw$rating, x=Inf, y=-Inf,
                 arrow=grid::arrow(), color='red') +
  annotate('text', label='The Shawshank Redemption?', x=Inf, y=-Inf,
           hjust=1.5, vjust=-1, color='red')

Annotation: Add shaded area

yearcount <- aggregate(title ~ year, data=movies, FUN=length)
ggplot(yearcount, aes(x=year, y=title)) + geom_line() +
  annotate('rect', xmin=1990, xmax=2000, ymin=-Inf, ymax=Inf, fill='blue', alpha=.25)

Bonus: Facet / Multi-plotting

Facet: Single grouping

gg <- ggplot(movies1_singletype, aes(x=rating, y=..density..)) + geom_bar()
gg + facet_grid(Action ~ .) # Plot with grouping variable in different window (Vertical)

Facet: Single grouping

# Plot with grouping variable in different window (Horizontal)
gg + facet_grid(. ~ Action) 

Facet: Multiple grouping

movies1_singletype$modern <- (movies1_singletype$year > 2000)
ggplot(movies1_singletype, aes(x=rating, y=..density..)) + 
  geom_bar() + facet_grid(modern ~ Action)

Facet: Multi-layer grouping

movies1_singletype$rated <- (movies1_singletype$mpaa != '')
ggplot(movies1_singletype, aes(x=rating, color=modern)) + 
  geom_line(stat="density") + facet_grid(Type ~ rated)

Facet: Change labels

  • Way 1: Change the grouping var in data.frame to reflect the alternate labels
  • Way 2: Write customized labeller function for facet_grid
print(label_value) # the default labeller plugg in facet_grid(..., labeller)
## function (variable, value) 
## as.character(value)
## <environment: namespace:ggplot2>
mylabeller <- function(variable, value){
  if ( variable=='rated' ) 
    value <- ifelse(value == TRUE, 'Rated Movies', 'Unrated Movies')
  else if ( variable=='Type' ) 
    as.character(value)
}

Facet: Change labels

ggplot(movies1_singletype, aes(x=rating, color=modern)) +
  geom_line(stat="density") + facet_grid(Type ~ rated, labeller=mylabeller)

Multi-plotting by gridExtra (1/3)

library(gridExtra)
drawPoint <- function(i) {
  ggplot(data.frame(x=1, y=1), aes(x=x,y=y)) + 
    geom_point(shape=i, size=5) +
    ggtitle(sprintf('shape=%s',i)) + 
    theme(axis.text.x=element_blank(), axis.text.y=element_blank()) +
    xlab(NULL) + ylab(NULL)
  }
drawPoint(25)

Multi-plotting by gridExtra (2/3)

symbol_points <- mapply(drawPoint, 1:25, SIMPLIFY=FALSE)
symbols <- do.call(arrangeGrob, symbol_points)
symbols

Multi-plotting by gridExtra (3/3)